{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Match all occurrences of the given pattern in the given text."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "path_to_file = \"data/do.txt\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## approach 1: does not match across line breaks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# in the middle we have 2017.8.26, which looks like a date so \n",
    "# both the second and third elements should match one or \n",
    "# two-digit numbers (month and days)\n",
    "process_pat = re.compile(r'\\d{7}-\\d{2}\\.\\d{4}\\.\\d{1,2}\\.\\d{1,2}\\.\\d{4}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['0001040-18.2017.8.26.0100']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test1 = \"0001040-18.2017.8.26.0100\" # from the instructions\n",
    "process_pat.findall(test1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['0001040-18.2017.10.1.0100']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test2 = \"0001040-18.2017.10.1.0100\" # matches single-digit days and double-digit months too\n",
    "process_pat.findall(test2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['0001040-18.2017.8.1.0100', '0001040-18.2017.8.23.0100']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test3 = \"0001040-18.2017.8.1.0100 foo bar baz 0001040-18.2017.8.23.0100\" # multiple occurrences\n",
    "process_pat.findall(test3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test4 = \"0001040-18.2017.8.222.0100\" # should NOT match\n",
    "process_pat.findall(test4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# numbers may be split across lines\n",
    "test5 = \"\"\"\n",
    "0001040-18.2017.8\n",
    ".26.0100\n",
    "\"\"\"\n",
    "\n",
    "# this will be solved in approach 2\n",
    "process_pat.findall(test5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8 distinct matches found: \n",
      "\n",
      "0033887-73.2017.8.26.0100\n",
      "0047167-14.2017.8.26.0100\n",
      "0001040-18.2017.8.26.0100\n",
      "0028605-54.2017.8.26.0100\n",
      "0032840-64.2017.8.26.0100\n",
      "0030677-14.2017.8.26.0100\n",
      "0026982-52.2017.8.26.0100\n",
      "0047169-81.2017.8.26.0100\n"
     ]
    }
   ],
   "source": [
    "def approach1(file,pattern):\n",
    "   \n",
    "    output = list()\n",
    "    \n",
    "    with open(file,\"r\") as f:\n",
    "        for line in f:\n",
    "            # there may be more than a single occurrence in the line so we must use findall\n",
    "            matches = pattern.findall(line)\n",
    "            \n",
    "            for match in matches:\n",
    "                output.append(match)\n",
    " \n",
    "    # remove duplicates                \n",
    "    return set(output)\n",
    "\n",
    "# running\n",
    "matches = approach1(path_to_file,process_pat)\n",
    "\n",
    "print(\"{} distinct matches found: \\n\".format(len(matches)))\n",
    "    \n",
    "for match in matches:    \n",
    "    print(match)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## approach 2:  match even across line breaks\n",
    "\n",
    "- read two lines at a time, so that we capture text which may have been split across lines\n",
    "\n",
    "- basic strategy: read two lines then rewind the file pointer 1 line (otherwise we would only be able to capture stuff every 2 lines)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "16 distinct matches found: \n",
      "\n",
      "0033887-73.2017.8.26.0100\n",
      "0047167-14.2017.8.26.0100\n",
      "0001040-18.2017.8.26.0100\n",
      "1021197-29.2016.8.26.0100\n",
      "0028605-54.2017.8.26.0100\n",
      "1056022-04.2013.8.26.0100\n",
      "1071666-79.2016.8.26.0100\n",
      "0155113-55.2011.8.26.0100\n",
      "0030677-14.2017.8.26.0100\n",
      "0032840-64.2017.8.26.0100\n",
      "1055991-47.2014.8.26.0100\n",
      "0026982-52.2017.8.26.0100\n",
      "1128654-91.2014.8.26.0100\n",
      "0047169-81.2017.8.26.0100\n",
      "1058515-46.2016.8.26.0100\n",
      "1042408-29.2013.8.26.0100\n"
     ]
    }
   ],
   "source": [
    "def approach2(file,pattern):\n",
    "    \n",
    "    line_break = r'\\n'\n",
    "   \n",
    "    output = list()\n",
    "    \n",
    "    with open(file,\"r\") as f:           \n",
    "        while True:\n",
    "            \n",
    "            line1 = f.readline()   \n",
    "            last_position = f.tell()\n",
    "            line2 = f.readline()\n",
    "            \n",
    "            # end of file\n",
    "            if not line1: break\n",
    "                           \n",
    "            merged = re.sub(line_break,'',line1+line2)\n",
    "                       \n",
    "            # there may be more than a single occurrence in the line so we must use findall    \n",
    "            matches = pattern.findall(merged)\n",
    "            \n",
    "            for match in matches:\n",
    "                output.append(match)\n",
    "            \n",
    "            # rewind to position before last line was read\n",
    "            f.seek(last_position)\n",
    "    \n",
    "    # remove duplicates\n",
    "    return set(output)                \n",
    "            \n",
    "# running\n",
    "matches = approach2(path_to_file,process_pat)\n",
    "\n",
    "print(\"{} distinct matches found: \\n\".format(len(matches)))\n",
    "    \n",
    "for match in matches:    \n",
    "    print(match)        "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Global TF Kernel (Python 3)",
   "language": "python",
   "name": "global-tf-python-3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
